*** How worried should we be? The implications of fabricated survey data for political science
*** Figure A10. When Enumerators Fabricate First Interview

set more off

* Set the working directory to the folder that holds VEN_fraud_data.dta
* (edit the path on the following line before running).
cd "C:\~\Downloads\"

use "VEN_fraud_data.dta", clear

* Parse the string timestamps (month-day-year hour:minute mask "MDYhm") into
* Stata datetime/C values. These count milliseconds since 1960, so they must
* be stored as double: the default float type cannot represent them exactly
* and would round each timestamp by up to a couple of minutes, which can
* shift the derived hour/date and create artificial ties when sorting.

* Upload timestamp and its hour of day.
gen double upload_time = Clock(upload, "MDYhm")
format upload_time %tC
gen upload_hour = hhC(upload_time)

* Interview start timestamp and its hour of day.
gen double start_time = Clock(vstart, "MDYhm")
format start_time %tC
gen start_hour = hhC(start_time)

* Calendar date and day of week (0 = Sunday, ..., 6 = Saturday) of the
* interview start.
gen start_date = dofC(start_time)
format start_date %td
gen dayofweek = dow(start_date)

* Calendar date and day of week of the upload.
gen upload_date = dofC(upload_time)
format upload_date %td
gen dayofweek_upload = dow(upload_date)

  
** interviewer level
* Drop canceled interviews unless they were flagged as likely fraud.
drop if status == "Canceled" & likelyfraud == 0

* Per-interviewer totals: the sum of the likelyfraud flag and the number of
* interviews with a non-missing flag.
* NOTE(review): presumably likelyfraud is a 0/1 indicator, so fraud_total is
* a count of flagged interviews -- confirm against the codebook.
egen fraud_total = total(likelyfraud), by(srvyr)
egen int_total = count(likelyfraud), by(srvyr)

* Order each interviewer's interviews by upload time, then start time.
* `stable' keeps tied observations in their existing order; without it Stata
* orders ties randomly, so int_order (and first_fake) could change from one
* run to the next.
sort srvyr upload_time start_time, stable

* Position of each interview within its interviewer's sequence.
by srvyr: gen int_order =  _n

* Keep the full sequence number, then blank it out for interviews with
* likelyfraud == 0 so that only the remaining ones enter the minimum below.
gen int_order_all = int_order
replace int_order = . if likelyfraud == 0

* Sequence number of each interviewer's earliest interview not blanked out
* above (missing if the interviewer has none).
egen first_fake = min(int_order), by(srvyr)

* Set the code 999999 to missing in colori and sexi; for colorr, both
* 999999 and 97 are set to missing.
foreach v of varlist colori sexi {
    replace `v' = . if `v' == 999999
}
replace colorr = . if inlist(colorr, 999999, 97)

* Reduce the data to one row per interviewer (srvyr):
*   - min / max / sd / mean of colori
*   - median of sexi
*   - first observed value of upload_time and of the per-interviewer
*     variables int_total, fraud_total, first_fake
collapse                                                    ///
    (min)    colori_min=colori                              ///
    (max)    colori_max=colori                              ///
    (sd)     colori_sd=colori                               ///
    (mean)   colori                                         ///
    (median) sexi                                           ///
    (first)  upload_time int_total fraud_total first_fake,  ///
    by(srvyr)

* Value and variable labels for the collapsed interviewer attributes.
label define sexi 1 "Male" 2 "Female"
label values sexi sexi
label variable colori "Interviewer Skin Tone"

* Share of flagged interviews, and the remaining (non-flagged) count.
generate fraud_rate = fraud_total / int_total
generate clean_total = int_total - fraud_total

* Figure A10
* Frequency histogram (one bar per distinct value) of first_fake across
* interviewers, saved to disk as a .gph file.
* NOTE(review): plotplain is not a built-in scheme; presumably it comes from
* the user-written blindschemes package -- confirm it is installed.
* NOTE(review): the output file is named figA7.gph although this is labelled
* Figure A10 -- confirm the intended file name before renaming.
* NOTE(review): the x-axis title looks like it is missing a noun after
* "fraudulent" (e.g. "interview") -- left unchanged to match the original.
histogram first_fake,                                                              ///
    discrete frequency                                                             ///
    scheme(plotplain)                                                              ///
    ylabel(0(2)12)                                                                 ///
    xtitle("Interviewer's first fraudulent in order" "of all their interviews")    ///
    saving(figA7.gph, replace)
